*===============================================================================
*=== FILE 6 --- FINALIZE FOR-RENT DATA 
*===============================================================================
clear all
set more off

cap cd ""


* continue with preprocessed example data, no values are real
use "data\preprocessed\pseudo\_FR_FourStates_Masterfile.dta", clear

* cleaning
drop word_count pair_count no_uw6ft all_words all_pairs housenumber housenumberfraction streetdirectionprefix streetsuffix streetdirectionsuffix unitnumber
destring(rentalprice), replace force

gen   ln_rental_price = ln(rentalprice)
label variable ln_rental_price "ln(RentalPrice)"

* drop irrelevant observations
drop if rentalprice < 500 | rentalprice > 10000 | rentalprice == .

* generate
destring(bedroomcnt bathroomcnt), replace force
replace bedroomcnt = 6 if bedroomcnt > 6 & bedroomcnt != .
replace bedroomcnt = 99 if bedroomcnt == .

replace bathroomcnt = 7 if bathroomcnt > 7 & bathroomcnt != .
replace bathroomcnt = 99 if bathroomcnt == .

replace bathroomcnt = round(bathroomcnt, 0.5)

destring(finishedsq lotsizesquarefeet yearbuilt majorremodelyear), replace force

xtile lotsizesquarefeet_pct = lotsizesquarefeet, n(10)
replace lotsizesquarefeet_pct = 99 if lotsizesquarefeet_pct == .

xtile finishedsquarefeet_pct = finishedsq, n(20)
replace finishedsquarefeet_pct = 99 if finishedsquarefeet_pct == .

gen     prop_age = listing_year - yearbuilt
replace prop_age = 0 if prop_age < 0
replace prop_age = 98 if prop_age > 98 & prop_age != .
replace prop_age = 99 if prop_age == .
label variable prop_age "PropertyAge"

egen prop_age_group = cut(prop_age), at(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,99)
replace prop_age_group = 99 if prop_age_group == .
label variable prop_age_group "PropertyAge_grouped"

gen remodel_age = listing_year - majorremodelyear
replace remodel_age = prop_age if prop_age < remodel_age & remodel_age != . & yearbuilt != .
replace remodel_age = prop_age if remodel_age == . & yearbuilt != .
replace remodel_age = 0 if remodel_age < 0
replace remodel_age = 98 if remodel_age > 98 & remodel_age != .
replace remodel_age = 99 if remodel_age == .
label variable remodel_age "YearsSinceMajorRemodel"

egen remodel_age_group = cut(remodel_age), at(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,99)
replace remodel_age_group = 99 if remodel_age_group == .
label variable remodel_age_group "YearsSinceMajorRemodel_grouped"

*merge county info
merge m:1 postalcode using "data\actual\county_zip"
drop if _merge == 2
drop _merge

* create fixed effects
egen city_quarter_fe = group(city listing_quarter state)
egen zip_quarter_fe  = group(postalcode listing_quarter state)

*cluster
egen county_year = group(county listing_year)
egen zip_year = group(postalcode listing_year)
egen zip_quarter = group(postalcode listing_quarter)

* rename indexes so we can differentiate them from indexes based on sales listing
rename index_state_year index_state_year_FR
rename index_state_quarter index_state_quarter_FR
rename index_zipcode_year index_zipcode_year_FR
rename index_zipcode_quarter index_zipcode_quarter_FR

* merge sales listing based indexes
merge m:m state listing_quarter using "data/preprocessed/actual/state_index_quarterly"
drop if _merge == 2
drop _merge

merge m:m state listing_year using "data/preprocessed/actual/state_index_yearly"
drop if _merge == 2
drop _merge

merge m:m postalcode_int listing_quarter using "data/preprocessed/actual/zipcode_index_quarterly"
drop if _merge == 2
drop _merge

merge m:m postalcode_int listing_year using "data/preprocessed/actual/zipcode_index_yearly"
drop if _merge == 2
drop _merge

*re-rename
rename index_state_year index_state_year_FS
rename index_state_quarter index_state_quarter_FS
rename index_zipcode_year index_zipcode_year_FS
rename index_zipcode_quarter index_zipcode_quarter_FS

* create interactions
gen interaction5_FR = uw6ft * index_state_year_FR
gen interaction6_FR = uw6ft * index_state_quarter_FR
gen interaction7_FR = uw6ft * index_zipcode_year_FR
gen interaction8_FR = uw6ft * index_zipcode_quarter_FR
gen interaction5_FS = uw6ft * index_state_year_FS
gen interaction6_FS = uw6ft * index_state_quarter_FS
gen interaction7_FS = uw6ft * index_zipcode_year_FS
gen interaction8_FS = uw6ft * index_zipcode_quarter_FS

* label
label variable uw6ft "flood_zone"

label variable interaction5_FR "flood_zone*index_state_year_FR"
label variable interaction6_FR "flood_zone*index_state_quarter_FR"
label variable interaction7_FR "flood_zone*index_zipcode_year_FR"
label variable interaction8_FR "flood_zone*index_zipcode_quarter_FR"

label variable interaction5_FS "flood_zone*index_state_year_FS"
label variable interaction6_FS "flood_zone*index_state_quarter_FS"
label variable interaction7_FS "flood_zone*index_zipcode_year_FS"
label variable interaction8_FS "flood_zone*index_zipcode_quarter_FS"

* merge apt variables
* the prop_fact file is a preprocessed example data, no values are real
merge m:m propertyid using "data\preprocessed\pseudo\PropFacts.dta", keepusing(apt apt2 apt3)
drop if _merge == 2
drop _merge

* drop repeated listings
gen date = date(listingcreationdate, "YMD")
format date %td

bysort propertyid (date): gen select = 1 if _n == 1

replace select = 1 if select ==. & (date-date[_n-1] > 180 | rentalprice != rentalprice[_n-1])

keep if select == 1

drop date select

compress

save "replication output\rent_regression_data", replace

